# SETUP ----
# The workspace is intentionally NOT cleared here. `rm(list = ls())` only
# removes visible global objects (attached packages, options and hidden
# objects stay as they are), and it silently destroys the caller's data when
# this script is sourced from another session. None of the steps visible in
# this script rely on an empty workspace; for a truly clean run, restart R or
# execute the script with `Rscript`.
# Load required libraries
library(tidyverse)
library(lubridate)
library(janitor)
library(rvest)
library(httr)
library(polite)
library(data.table)
library(ggplot2)
library(scales)
library(plotly)
# Set working directory
# setwd("/Users/helenguo/Documents")
# WEB SCRAPE AND CLEAN ----
# Reference the Wikipedia page of natural disasters by death toll
url <- "https://en.wikipedia.org/wiki/List_of_natural_disasters_by_death_toll"

# Register with the host before scraping
url_bow <- polite::bow(url)

# Download the page. polite::scrape() can hand back NULL instead of raising an
# error (e.g. when scraping is not permitted or the request fails), so stop
# with a clear message rather than failing later inside rvest.
page_html <- polite::scrape(url_bow)
if (is.null(page_html)) {
  stop("Could not scrape ", url, call. = FALSE)
}

# Scrape tables from all sections
ind_html <- page_html %>%
  rvest::html_nodes("table.wikitable") %>% # Pull out wiki tables
  rvest::html_table(fill = TRUE)

# Tables 2 and 3 are expected to be the 20th and 21st century all cause
# disasters; the page layout can change, so verify they exist at all.
if (length(ind_html) < 3L) {
  stop(
    "Expected at least 3 wikitable tables on the page, found ",
    length(ind_html),
    call. = FALSE
  )
}

# data.frame() makes the column names syntactic ("Death toll" -> "Death.toll")
century_20_disasters <- data.frame(ind_html[[2]])
century_21_disasters <- data.frame(ind_html[[3]])

# Columns needed downstream - year, death toll, and event type
required_columns <- c("Year", "Death.toll", "Type")

# rbind() on data frames fails when the column sets differ, so bind only the
# columns both tables share (all of them when the tables have the same layout).
shared_columns <- intersect(
  names(century_20_disasters),
  names(century_21_disasters)
)
missing_columns <- setdiff(required_columns, shared_columns)
if (length(missing_columns) > 0L) {
  stop(
    "Scraped tables are missing expected column(s): ",
    paste(missing_columns, collapse = ", "),
    call. = FALSE
  )
}

# Merge 20th and 21st century data frames together
century_20_21_disasters <- rbind(
  century_20_disasters[, shared_columns, drop = FALSE],
  century_21_disasters[, shared_columns, drop = FALSE]
)

# Select relevant columns - year, death toll, and event type
final_variables <- century_20_21_disasters[, required_columns]
#' Clean a scraped death toll string
#'
#' Removes commas, plus signs and bracketed Wikipedia reference markers
#' (e.g. "[12]") from each element. Everything else, including a range
#' separator between two numbers, is left in place. The result is still text;
#' no numeric conversion happens here.
#'
#' @param Death.toll Vector of values to clean (coerced to character by
#'   `gsub()`).
#' @return A character vector of the same length as `Death.toll`, returned
#'   visibly.
convert_death_toll <- function(Death.toll) {
  # Remove commas and plus signs from the string
  without_separators <- gsub("[,\\+]", "", Death.toll)
  # Remove Wikipedia references in brackets from the string; the lazy `.*?`
  # removes each "[...]" on its own instead of everything between the first
  # "[" and the last "]". Being the last expression, this is the return value.
  gsub("\\[.*?\\]", "", without_separators)
}
# Apply the cleaning function to every selected column. This also turns Year
# into text, which the discrete x axis of the plot relies on. lapply() keeps
# one column per variable for any number of rows, whereas sapply() collapses a
# single-row table into a plain vector and produces a wrongly shaped data frame.
converted_death_toll <- data.frame(lapply(final_variables, convert_death_toll))

# Convert the death toll to numbers, expressed in thousands. When a range is
# given the midpoint is used; when only a bound is given the bound itself is
# used (example: 20,000+ converts to 20000, i.e. 20 thousand). vapply()
# guarantees exactly one number per row.
converted_death_toll$Death.toll <- vapply(
  strsplit(converted_death_toll$Death.toll, split = "–"),
  function(bounds) mean(as.numeric(bounds) / 1000),
  numeric(1)
)
# CREATE PLOT ----
# Plot the death toll (vertical / y axis) by year (horizontal / x axis) color
# coded by type of disaster.
# Year is text at this point, so the x axis is discrete. With a discrete x,
# ggplot2 groups rows by every discrete variable (Year and Type), which leaves
# one point per group and nothing for geom_line() to connect. Setting
# `group = Type` explicitly draws one line per disaster type across the years.
plot <- ggplot(
  converted_death_toll,
  aes(x = Year, y = Death.toll, color = Type, group = Type)
) +
  geom_point() +
  geom_line() +
  labs(
    x = "Year",
    y = "Death Toll (Thousands)",
    color = "Type of Disaster"
  ) +
  # Stagger the year labels over two rows so they do not overlap
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  ggtitle("Death Toll by Year and Type of Disaster") +
  theme(
    plot.title = element_text(hjust = 0.5), # Center the title
    axis.text.x = element_text(angle = 30, hjust = 0.5, vjust = 0.5)
  )

# Transform plot into visible (longer) interactive version using ggplotly
# Scroll right to view entirety of plot
p <- ggplotly(plot)
p